Let’s take a look at the dataset

# Print the tibble to preview its first rows and column types.
instacart
## # A tibble: 1,384,617 × 15
##    order_id product_id add_to_cart_order reordered user_id eval_set order_number
##       <int>      <int>             <int>     <int>   <int> <chr>           <int>
##  1        1      49302                 1         1  112108 train               4
##  2        1      11109                 2         1  112108 train               4
##  3        1      10246                 3         0  112108 train               4
##  4        1      49683                 4         0  112108 train               4
##  5        1      43633                 5         1  112108 train               4
##  6        1      13176                 6         0  112108 train               4
##  7        1      47209                 7         0  112108 train               4
##  8        1      22035                 8         1  112108 train               4
##  9       36      39612                 1         0   79431 train              23
## 10       36      19660                 2         1   79431 train              23
## # ℹ 1,384,607 more rows
## # ℹ 8 more variables: order_dow <int>, order_hour_of_day <int>,
## #   days_since_prior_order <int>, product_name <chr>, aisle_id <int>,
## #   department_id <int>, aisle <chr>, department <chr>

Data wrangling

# Standardise the column names, then turn the day-of-week codes (0-6) into
# weekday labels stored as a factor whose levels run Monday through Sunday.
instacart <- instacart %>%
  janitor::clean_names() %>%
  mutate(
    order_dow = as.character(order_dow),
    # Look each code up by name; anything that is not one of the seven codes
    # has no match (NA) and falls back to its own text via coalesce().
    order_dow = coalesce(
      unname(
        c(
          `0` = "Sun", `1` = "Mon", `2` = "Tue", `3` = "Wed",
          `4` = "Thu", `5` = "Fri", `6` = "Sat"
        )[order_dow]
      ),
      order_dow
    ),
    # Values outside the listed levels become NA in the factor.
    order_dow = factor(
      order_dow,
      levels = c("Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun")
    )
  )

The dataset contains 15 variables: order_id, product_id, add_to_cart_order, reordered, user_id, eval_set, order_number, order_dow, order_hour_of_day, days_since_prior_order, product_name, aisle_id, department_id, aisle, department. With this information, I am going to create 3 distinct plots:

  1. A histogram showing the number of orders for each day of the week.

  2. A line plot showing the average number of orders by hour within each day of the week.

  3. A box plot of the distribution of add-to-cart order for the top 5 products.

Histogram

# Histogram of the number of orders placed on each day of the week.
# `instacart` holds one row per product in an order (the same order_id repeats
# across rows), so the rows are first collapsed to one per order; otherwise
# the bars would count ordered items rather than orders.
instacart %>%
  distinct(order_id, order_dow) %>%
  plot_ly(x = ~order_dow) %>%
  add_histogram(name = "Count of Orders") %>%
  layout(
    title = "Distribution of Orders by Day of the Week",
    xaxis = list(title = "Day of the Week"),
    yaxis = list(title = "Count of Orders")
  )

Line plot

# Line plot of `avg_orders` against hour of day, one coloured trace per day of
# the week.
# NOTE(review): n() counts rows, and each row is one product in an order, so
# `avg_orders` is the number of ordered items per distinct user in that
# day/hour cell rather than a count of orders. Confirm this is the metric the
# "Average Number of Orders" labels are meant to describe.
instacart %>%
  group_by(order_dow, order_hour_of_day) %>%
  # Drop the grouping explicitly: the summary is returned ungrouped and
  # summarise() no longer emits its regrouping message.
  summarise(avg_orders = n() / n_distinct(user_id), .groups = "drop") %>%
  # Sort by day and hour so each line is drawn in chronological order.
  arrange(order_dow, order_hour_of_day) %>%
  plot_ly(
    x = ~order_hour_of_day,
    y = ~avg_orders,
    color = ~order_dow,
    type = "scatter",
    mode = "lines+markers",
    marker = list(size = 7),
    line = list(width = 2)
  ) %>%
  layout(
    title = "Average Number of Orders by Hour within each Day of the Week",
    xaxis = list(title = "Hour of the Day"),
    yaxis = list(title = "Average Number of Orders"),
    legend = list(title = list(text = "Day of the Week")),
    showlegend = TRUE
  )

Box plot

# Names of the five products that appear on the most rows of `instacart`.
# count() replaces group_by() + tally(), and slice_max() replaces the
# superseded top_n(). Ties at the cut-off are still kept (with_ties = TRUE is
# the default), so more than five names can be returned. The names come back
# ordered by descending count.
top_products <- instacart %>%
  count(product_name) %>%
  slice_max(order_by = n, n = 5) %>%
  pull(product_name)

# Keep only the rows whose product is one of the top products.
subset_data <- filter(instacart, product_name %in% top_products)

# Vertical box plot of add-to-cart position, one box per product in
# `subset_data`; x tick labels are tilted by 15 degrees.
subset_data %>%
  plot_ly(
    x = ~product_name,
    y = ~add_to_cart_order,
    type = "box",
    orientation = "v"
  ) %>%
  layout(
    title = "Distribution of Add to Cart Order for Top 5 Products",
    xaxis = list(title = "Product Name", tickangle = 15),
    yaxis = list(title = "Add to Cart Order")
  )